Load Datasets

# Read every raw table from the local data/ folder with readr. The echoed
# column specifications below show which columns were parsed as numbers,
# date-times, dates, or left as text.
train <- read_csv(file = file.path("data", "train.csv"))
## Rows: 2018352 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): county, is_business, product_type, target, is_consumption, data_bl...
## dttm (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mapping <- read_csv(file = file.path("data", "weather_station_to_county_mapping.csv"))
## Rows: 112 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): county_name
## dbl (3): longitude, latitude, county
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
historical_weather <- read_csv(file = file.path("data", "historical_weather.csv"))
## Rows: 1710802 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (17): temperature, dewpoint, rain, snowfall, surface_pressure, cloudcov...
## dttm  (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
client <- read_csv(file = file.path("data", "client.csv"))
## Rows: 41919 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (6): product_type, county, eic_count, installed_capacity, is_business, ...
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Note: both date columns of the two price tables come in as character.
gas_prices <- read_csv(file = file.path("data", "gas_prices.csv"))
## Rows: 637 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): forecast_date, origin_date
## dbl (3): lowest_price_per_mwh, highest_price_per_mwh, data_block_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
electricity_prices <- read_csv(file = file.path("data", "electricity_prices.csv"))
## Rows: 15286 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): forecast_date, origin_date
## dbl (2): euros_per_mwh, data_block_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
forecast_weather <- read_csv(file = file.path("data", "forecast_weather.csv"))
## Rows: 3424512 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (16): latitude, longitude, hours_ahead, temperature, dewpoint, cloudcov...
## dttm  (2): origin_datetime, forecast_datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Separate train data

# Split the timestamp into separate character `date` and `time` columns.
train2 <- train %>%
  separate(datetime, into = c("date", "time"), sep = " ")

# This analysis treats even row_id rows as production and odd row_id rows as
# consumption.
even_row <- train$row_id %% 2 == 0
production <- train[even_row, ]
consumption <- train[!even_row, ]

Boxplots

Looking at only noon of train data(high production time)

# Split the date/time-separated training data by row_id parity: this analysis
# treats even row_id rows as production and odd row_id rows as consumption.
production <- train2[train2$row_id %% 2 == 0, ]
consumption <- train2[train2$row_id %% 2 != 0, ]

# Keep only the 12:00 observations of each series.
noonproduction <- production[production$time == "12:00:00", ]
# Fixed: this was previously subset from `production`, so every "consumption"
# boxplot below actually displayed production data.
noonconsumption <- consumption[consumption$time == "12:00:00", ]

# Trim the long upper tails so the boxes stay readable.
np2 <- noonproduction[noonproduction$target < 1000, ]
nc2 <- noonconsumption[noonconsumption$target < 5000, ]

# Target by business flag, then by product type, for each series.
boxplot(np2$target ~ np2$is_business)

boxplot(nc2$target ~ nc2$is_business)

boxplot(np2$target ~ np2$product_type)

boxplot(nc2$target ~ nc2$product_type)

At noon, there doesn’t appear to be a particularly obvious difference in consumption or production(target) between businesses and non businesses, but there are more obvious differences between product types and consumption/production levels, such as clients with product type 3 tending to produce and consume more than other types.

# Distribution of the noon production target (all rows, no upper cut-off).
hist(noonproduction$target)

# Summary statistics for the same column. favstats() is not base R —
# presumably mosaic::favstats, loaded outside this excerpt (TODO confirm).
favstats(noonproduction$target)
##  min  Q1 median      Q3      max     mean       sd     n missing
##    0 5.9 58.011 258.129 10976.49 284.4294 724.9304 42049       0

Data set reduction for graphing(looking at production)

I narrowed the historical weather data to times with direct solar radiation at the coordinates used for county 3, and narrowed the train (explore) data to county 3 production rows where production is actually occurring (target > 0).

There are now considerably fewer cases in each

# Re-read the raw training table so `train` (and everything derived from it
# here) still has its original `datetime` column.
train <- read_csv("data/train.csv") 
## Rows: 2018352 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): county, is_business, product_type, target, is_consumption, data_bl...
## dttm (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
explore <- train[train$county == 3, ] # used later

# Weather rows with some direct solar radiation, restricted to the single grid
# point (58.8, 25.7) that this analysis uses for county 3. near() compares
# with a tolerance instead of exact floating-point equality.
historical_weather_sun <- historical_weather[historical_weather$direct_solar_radiation > 0, ]
historical_weather_sun3 <- historical_weather_sun[
  near(historical_weather_sun$latitude, 58.8) &
    near(historical_weather_sun$longitude, 25.7),
]

# County-3 production rows (even row_id). Fixed: this used to subset the
# global `production`, which was derived from `train2` and therefore had
# `date`/`time` columns but no `datetime`; the year extraction below then hit
# an "Unknown or uninitialised column: `datetime`" warning and produced no
# usable year.
explore_production <- explore[explore$row_id %% 2 == 0, ]
explore_production_y <- explore_production[explore_production$target > 0, ]

# format() returns the year as text, so compare against "2022".
explore_production_y$year <- format(explore_production_y$datetime, "%Y")
historical_weather_sun3$year <- format(historical_weather_sun3$datetime, "%Y")
historical_weather_sun3_2022 <- historical_weather_sun3[historical_weather_sun3$year == "2022", ]
explore_production_y_2022 <- na.omit(explore_production_y[explore_production_y$year == "2022", ])

# All weather rows (day and night) for the same grid point.
historical_weather_3 <- historical_weather[
  near(historical_weather$latitude, 58.8) &
    near(historical_weather$longitude, 25.7),
] # used later

Preliminary Merging

# Attach cloud cover and temperature from the county-3 weather grid point to
# the county-3 training rows, matching on the hourly timestamp.
explore_cloud <- left_join(
  explore,
  historical_weather_3 %>% distinct(datetime, cloudcover_total, temperature),
  by = "datetime"
)

# Merge gas prices.
# NOTE(review): data_block_id is shifted back by one before joining; the
# reason is not visible in this file — confirm the intended alignment.
mgasprice <- gas_prices
mgasprice$data_block_id <- gas_prices$data_block_id - 1
explore_cloud <- left_join(
  explore_cloud,
  mgasprice %>% distinct(data_block_id, lowest_price_per_mwh, highest_price_per_mwh),
  by = "data_block_id"
)

# Merge electricity prices.
# forecast_date is read as text, so it has to be parsed here. Parse it
# explicitly as UTC: readr stores the `datetime` columns it parsed as UTC,
# whereas as.POSIXct() without `tz` uses the session's local time zone, which
# would shift every join key by the local UTC offset.
electricity_prices$forecast_date <- as.POSIXct(
  electricity_prices$forecast_date,
  format = "%m/%d/%y %H:%M",
  tz = "UTC"
)
electricity_prices$datetime <- electricity_prices$forecast_date
explore_cloud <- left_join(
  explore_cloud,
  electricity_prices %>% distinct(datetime, euros_per_mwh),
  by = "datetime"
)

Plots (for region 3)

Production

# From here on `explore` carries the merged weather and price columns.
explore <- explore_cloud

# Even row_id rows are treated as production, odd ones as consumption.
even_row <- explore$row_id %% 2 == 0
explore_production <- explore[even_row, ]
explore_consumption <- explore[!even_row, ]

# Target plus the candidate explanatory columns, kept together for reference.
vars <- explore_production[, c(
  "target", "cloudcover_total", "temperature",
  "lowest_price_per_mwh", "highest_price_per_mwh", "euros_per_mwh"
)]

# Production target against each candidate predictor.
plot(explore_production$target ~ explore_production$cloudcover_total)

plot(explore_production$target ~ explore_production$temperature)

plot(explore_production$target ~ explore_production$lowest_price_per_mwh) # gas

plot(explore_production$target ~ explore_production$euros_per_mwh) # electricity

There appears to be some relationship between temperature and production, with higher production values present as temperature increases. Patterns are less clear/present for cloudcover and gas and electricity prices

Consumption

# Consumption target against each candidate predictor (county 3 only).
plot(explore_consumption$target ~ explore_consumption$cloudcover_total)

plot(explore_consumption$target ~ explore_consumption$temperature)

plot(explore_consumption$target ~ explore_consumption$lowest_price_per_mwh)#gas

# The x-axis is restricted to 0-1100 for the electricity price plot.
plot(explore_consumption$target ~ explore_consumption$euros_per_mwh, xlim = c(0,1100))#electricity

Some interesting patterns can be observed in the consumption plots. To different degrees, the trends appear to be split into two different groups. This is especially apparent in the target vs temperature plot.

Searching for cause of consumption split

separate day and night

A rough separation of approximately when daylight may be present

# Split the timestamp so the clock time can be inspected on its own.
explore_consumption2 <- explore_consumption %>%
  separate(datetime, into = c("date", "time"), sep = " ")

# Parsing a time-only string attaches the current date to it; the `day`
# sequence below is built the same way, so `%in%` effectively compares the
# hour of day (07:00 through 17:00, hourly).
explore_consumption2$time <- as.POSIXct(explore_consumption2$time, format = "%H:%M:%S")
day_start <- as.POSIXct("07:00:00", format = "%H:%M:%S")
day_end <- as.POSIXct("17:00:00", format = "%H:%M:%S")
day <- seq(from = day_start, to = day_end, by = "1 hour")

# One colour per row: red for the daytime hours, blue otherwise.
day_night_col <- ifelse(explore_consumption2$time %in% day, "red", "blue")

plot(explore_consumption2$target ~ explore_consumption2$cloudcover_total, col = day_night_col)
legend(
  "topleft",
  legend = c("Day", "Night"),
  col = c("red", "blue"), pch = 15
)

plot(explore_consumption$target ~ explore_consumption$temperature, col = day_night_col)

plot(explore_consumption$target ~ explore_consumption$lowest_price_per_mwh, col = day_night_col) # gas

# The electricity plot is left uncoloured.
plot(explore_consumption$target ~ explore_consumption$euros_per_mwh) # electricity

Some disparities between the patterns of night and day can be observed, but they fail to explain the split in the data and only accentuate it.

Separate winter and summer

Once again, a rough separation to get a rough idea of the trends. Considers May through September to be summer, and everything else winter.

# Break the calendar date into year / month / day text columns, then make the
# month numeric so it can be matched against the summer months.
explore_consumption3 <- explore_consumption2 %>%
  separate(date, into = c("year", "month", "day"), sep = "-")
explore_consumption3$month <- as.numeric(explore_consumption3$month)

# May through September count as summer.
summer <- 5:9
season_col <- ifelse(explore_consumption3$month %in% summer, "red", "blue")

plot(explore_consumption3$target ~ explore_consumption3$cloudcover_total, col = season_col)
legend(
  "topleft",
  legend = c("Summer", "Else"),
  col = c("red", "blue"), pch = 15
)

plot(explore_consumption3$target ~ explore_consumption3$temperature, col = season_col)

plot(explore_consumption3$target ~ explore_consumption3$lowest_price_per_mwh, col = season_col) # gas

# The electricity plot is left uncoloured.
plot(explore_consumption$target ~ explore_consumption$euros_per_mwh) # electricity

Once again, there are some trends, but none which explain the split.

Separate is_business

# One colour per row: green where is_business is 1, orange otherwise.
business_col <- ifelse(explore_consumption$is_business == 1, "green", "orange")

plot(explore_consumption$target ~ explore_consumption$cloudcover_total, col = business_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

plot(explore_consumption$target ~ explore_consumption$temperature, col = business_col)

plot(explore_consumption$target ~ explore_consumption$lowest_price_per_mwh, col = business_col) # gas

# The electricity plot is left uncoloured.
plot(explore_consumption$target ~ explore_consumption$euros_per_mwh) # electricity

Here, particularly with temperature and gas price, there is a clearly observable split between clients that are and are not businesses. Those which are not businesses tend to consume much less energy while still following the overall trend(ex.decreasing) on a much smaller scale.

Time Series Plots

# Start the time-series section from a fresh copy of the training table.
train <- read_csv(file = file.path("data", "train.csv"))
## Rows: 2018352 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): county, is_business, product_type, target, is_consumption, data_bl...
## dttm (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Even row_id rows are treated as production, odd ones as consumption.
even_row <- train$row_id %% 2 == 0
production <- train[even_row, ]
consumption <- train[!even_row, ]

Consumption vs Production

# Plotting copy of the timestamp. `datetime` was already parsed as a
# date-time by read_csv (see its column spec).
time_format <- "%Y-%m-%d %H:%M:%S"
production$datetime2 <- as.POSIXct(production$datetime, format = time_format)
consumption$datetime2 <- as.POSIXct(consumption$datetime, format = time_format)

# Nearly transparent lines so that dense regions show up as darker bands.
faint_red <- rgb(1, 0, 0, alpha = 0.05)
faint_blue <- rgb(0, 0, 1, alpha = 0.05)
plot(
  consumption$datetime2, consumption$target,
  type = "l", xlab = "Date", ylab = "Target",
  main = "Energy Use(Target) by Date", col = faint_red
)


lines(production$datetime2, production$target, col = faint_blue)
# The legend uses a stronger alpha than the lines so the swatches are visible.
legend(
  "topleft",
  legend = c("Consumption", "Production"),
  col = c(rgb(1, 0, 0, alpha = 0.5), rgb(0, 0, 1, alpha = 0.5)), pch = 15
)

With this plot, it is clear that consumption and production highs alternate by season, with production being higher in the summer months and lower in the winter, while consumption is higher in the winter and lower in the summer.

This follows the sort of trends one would expect from Estonia.

Is business vs not

Consumption

# Consumption rows split by the business flag (b = business, n = not).
is_business_row <- consumption$is_business == 1
is_other_row <- consumption$is_business == 0
consumption_b <- consumption[is_business_row, ]
consumption_n <- consumption[is_other_row, ]

time_format <- "%Y-%m-%d %H:%M:%S"
consumption_b$datetime2 <- as.POSIXct(consumption_b$datetime, format = time_format)
consumption_n$datetime2 <- as.POSIXct(consumption_n$datetime, format = time_format)

# Business in faint red first, non-business overlaid in faint blue.
faint_red <- rgb(1, 0, 0, alpha = 0.05)
faint_blue <- rgb(0, 0, 1, alpha = 0.05)
plot(
  consumption_b$datetime2, consumption_b$target,
  type = "l", xlab = "Date", ylab = "Target",
  main = "Energy Consumption(Target) by Date", col = faint_red
)


lines(consumption_n$datetime2, consumption_n$target, col = faint_blue)
legend(
  "topleft",
  legend = c("Business", "Not"),
  col = c(rgb(1, 0, 0, alpha = 0.5), rgb(0, 0, 1, alpha = 0.5)), pch = 15
)

Similar to the trends seen before, businesses and non businesses follow similar trends as far as ups and downs go, but the scale of the consumption is much higher for businesses.

Production

# Production rows split by the business flag (b = business, n = not).
is_business_row <- production$is_business == 1
is_other_row <- production$is_business == 0
production_b <- production[is_business_row, ]
production_n <- production[is_other_row, ]

time_format <- "%Y-%m-%d %H:%M:%S"
production_b$datetime2 <- as.POSIXct(production_b$datetime, format = time_format)
production_n$datetime2 <- as.POSIXct(production_n$datetime, format = time_format)

# Business in faint red first, non-business overlaid in faint blue.
faint_red <- rgb(1, 0, 0, alpha = 0.05)
faint_blue <- rgb(0, 0, 1, alpha = 0.05)
plot(
  production_b$datetime2, production_b$target,
  type = "l", xlab = "Date", ylab = "Target",
  main = "Energy Production(Target) by Date", col = faint_red
)


lines(production_n$datetime2, production_n$target, col = faint_blue)
legend(
  "topleft",
  legend = c("Business", "Not"),
  col = c(rgb(1, 0, 0, alpha = 0.5), rgb(0, 0, 1, alpha = 0.5)), pch = 15
)

Whether a client is or is not a business appears to have little effect on energy production.

Looking at client

client_3 <- client[client$county == 3, ] # there are 4 clients listed in county 3
exploresp <- separate(explore, datetime, c("date", "time"), sep = " ")

# Keep the join key as a Date on both sides: client's `date` column is read
# as a Date by read_csv, so the text date split off above is converted to a
# Date as well.
# Fixed: the left key used to be a local-time POSIXct while client_3$date
# stayed a Date (the intended conversion was assigned to an unused
# `client_3date` built from the full `client` table), so matching relied on
# implicit type coercion inside the join.
exploresp$date <- as.Date(exploresp$date, format = "%Y-%m-%d")

# Bring in eic_count and installed_capacity for each (date, is_business,
# product_type) combination in county 3.
explore_client <- left_join(
  exploresp,
  client_3 %>% distinct(date, is_business, product_type, eic_count, installed_capacity),
  by = c("date", "is_business", "product_type")
)

# Production rows only (is_consumption is 0 for these).
expclient_pro <- explore_client[explore_client$is_consumption %% 2 == 0, ]
plot(expclient_pro$target ~ expclient_pro$eic_count)

plot(expclient_pro$target ~ expclient_pro$installed_capacity) #***

As one would expect, the higher the installed capacity, the higher the production can potentially be, leading to the pattern observed. A similar principle applies to EIC count. The bands likely result from each client having the same count/value for a number of different times before changing and maintaining that value for a while.

Merging train and client

# Split a copy of the timestamp into text `date` and `time` columns, keeping
# the original `datetime` intact for later joins.
train$datetime2 <- train$datetime
train <- separate(train, datetime2, c("date", "time"), sep = " ")

# Fixed: convert to Date rather than POSIXct. client's `date` column is read
# as a Date by read_csv, and a POSIXct built without `tz` is local-time
# midnight, so the join below used to depend on implicit Date/POSIXct
# coercion of the key.
train$date <- as.Date(train$date, format = "%Y-%m-%d")

# Attach eic_count and installed_capacity for each client segment and day.
train <- left_join(
  train,
  client %>% distinct(date, is_business, county, product_type, eic_count, installed_capacity),
  by = c("date", "is_business", "product_type", "county")
)

# Even row_id rows are treated as production, odd ones as consumption.
production <- train[train$row_id %% 2 == 0, ]
consumption <- train[train$row_id %% 2 != 0, ]
plot(eic_count ~ installed_capacity, data = client)

plot(target ~ eic_count, data = production, main = "Production vs EIC")

plot(target ~ installed_capacity, data = production, main = "Production vs Installed Capacity")

plot(target ~ eic_count, data = consumption, main = "Consumption vs EIC")

plot(target ~ installed_capacity, data = consumption, main = "Consumption vs Installed Capacity")

Looking at installed capacity and eic count, it appears there may be some interaction with another variable in explaining consumption.

# One colour per consumption row: green where is_business is 1, else orange.
business_col <- ifelse(consumption$is_business == 1, "green", "orange")

plot(target ~ eic_count, data = consumption, main = "Consumption vs EIC", col = business_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

plot(target ~ installed_capacity, data = consumption, main = "Consumption vs Installed Capacity", col = business_col)

Here it’s clear that there is interaction between is_business and installed_capacity/eic_count in explaining consumption

Data Merging

# merge with historical weather
# Round the grid coordinates to one decimal on BOTH sides of the join.
# Fixed: only the weather table used to be rounded here (mapping was not
# rounded until the forecast section), so any mapping coordinate carrying
# floating-point noise failed to match and its weather rows were then
# discarded by na.omit().
historical_weather$latitude <- round(historical_weather$latitude, 1)
historical_weather$longitude <- round(historical_weather$longitude, 1)
station_counties <- mapping
station_counties$latitude <- round(station_counties$latitude, 1)
station_counties$longitude <- round(station_counties$longitude, 1)
hweather <- left_join(
  historical_weather,
  station_counties %>% distinct(longitude, latitude, county),
  by = c("longitude", "latitude")
)
# Drops weather rows with no county (and any row with another missing value).
hweather <- na.omit(hweather)

# aggregate weather by timestamp and county
# Total precipitation is rain plus a tenth of snowfall.
# NOTE(review): the /10 presumably converts snow depth to a water
# equivalent — confirm units.
hweather$total_precipitation <- hweather$snowfall / 10 + hweather$rain

# One pass over the groups instead of four separate summaries that were then
# stitched together by row position; .groups = "drop" returns an ungrouped
# table and silences the regrouping message.
hweather_av <- hweather %>%
  group_by(datetime, county) %>%
  summarize(
    temperature = mean(temperature),
    direct_solar_radiation = mean(direct_solar_radiation),
    total_precipitation = mean(total_precipitation),
    dewpoint = mean(dewpoint),
    .groups = "drop"
  )

# add the county-level weather averages to train
train_hist <- left_join(train, hweather_av, by = c("county", "datetime"))
# Keeps only rows that are complete in every column.
train_hist <- na.omit(train_hist)

explore

# Even row_id rows are treated as production, odd ones as consumption.
even_row <- train_hist$row_id %% 2 == 0
production_hist <- train_hist[even_row, ]
consumption_hist <- train_hist[!even_row, ]

# Target vs temperature, coloured green where is_business is 1, else orange.
production_col <- ifelse(production_hist$is_business == 1, "green", "orange")
plot(production_hist$target ~ production_hist$temperature, col = production_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

consumption_col <- ifelse(consumption_hist$is_business == 1, "green", "orange")
plot(consumption_hist$target ~ consumption_hist$temperature, col = consumption_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

consumption weather graphs

# Consumption target against each county-averaged historical weather variable.
plot(consumption_hist$target ~ consumption_hist$temperature)

plot(consumption_hist$target ~ consumption_hist$direct_solar_radiation)

plot(consumption_hist$target ~ consumption_hist$total_precipitation)

plot(consumption_hist$target ~ consumption_hist$dewpoint)

production weather graphs

# Production target against each county-averaged historical weather variable.
plot(production_hist$target ~ production_hist$temperature)

plot(production_hist$target ~ production_hist$direct_solar_radiation)

plot(production_hist$target ~ production_hist$total_precipitation)

plot(production_hist$target ~ production_hist$dewpoint)

historical weather regression

# Linear model of consumption on the four historical weather averages
# (additive terms only, no interactions); summary() prints the fit below.
hcon_lm <- lm(target ~ temperature + direct_solar_radiation + total_precipitation + dewpoint, data = consumption_hist)
summary(hcon_lm)
## 
## Call:
## lm(formula = target ~ temperature + direct_solar_radiation + 
##     total_precipitation + dewpoint, data = consumption_hist)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -556.9  -346.3  -260.6   -50.9 10496.6 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            445.11287    2.03378 218.860  < 2e-16 ***
## temperature            -10.27157    0.55091 -18.645  < 2e-16 ***
## direct_solar_radiation  -0.11057    0.01387  -7.971 1.58e-15 ***
## total_precipitation     19.82313    6.88282   2.880  0.00398 ** 
## dewpoint                 4.06715    0.57735   7.044 1.86e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 876.3 on 494985 degrees of freedom
## Multiple R-squared:  0.006117,   Adjusted R-squared:  0.006109 
## F-statistic: 761.7 on 4 and 494985 DF,  p-value: < 2.2e-16
# Same model specification, fitted to the production rows.
hprod_lm <- lm(target ~ temperature + direct_solar_radiation + total_precipitation + dewpoint, data = production_hist)
summary(hprod_lm)
## 
## Call:
## lm(formula = target ~ temperature + direct_solar_radiation + 
##     total_precipitation + dewpoint, data = production_hist)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -869.9  -28.5   -2.5    9.1 8118.6 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -17.704500   0.547663  -32.33   <2e-16 ***
## temperature              9.379601   0.148350   63.23   <2e-16 ***
## direct_solar_radiation   1.103648   0.003736  295.45   <2e-16 ***
## total_precipitation     20.906221   1.853426   11.28   <2e-16 ***
## dewpoint                -9.578977   0.155472  -61.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 236 on 494985 degrees of freedom
## Multiple R-squared:  0.3244, Adjusted R-squared:  0.3244 
## F-statistic: 5.943e+04 on 4 and 494985 DF,  p-value: < 2.2e-16

Merging forecast weather

# organize mapping file: drop incomplete rows and round the coordinates to
# one decimal so they compare equal to the rounded weather grid below
mapping <- na.omit(mapping)
mapping$longitude <- round(mapping$longitude, 1)
mapping$latitude <- round(mapping$latitude, 1)

# merge with forecast weather
forecast_weather$latitude <- round(forecast_weather$latitude, 1)
forecast_weather$longitude <- round(forecast_weather$longitude, 1)
fweather <- left_join(
  forecast_weather,
  mapping %>% distinct(longitude, latitude, county),
  by = c("longitude", "latitude")
)
# Drops forecast rows with no county (and any row with another missing value).
fweather <- na.omit(fweather)

# aggregate weather by forecast timestamp and county
# One pass over the groups instead of four separate summaries that were then
# stitched together by row position; .groups = "drop" returns an ungrouped
# table and silences the regrouping message.
# NOTE(review): rows that share a forecast_datetime but differ in
# origin_datetime / hours_ahead are averaged together — confirm this pooling
# is intended.
fweather_av <- fweather %>%
  group_by(forecast_datetime, county) %>%
  summarize(
    temperature = mean(temperature),
    direct_solar_radiation = mean(direct_solar_radiation),
    total_precipitation = mean(total_precipitation),
    dewpoint = mean(dewpoint),
    .groups = "drop"
  )

# add the county-level forecast averages to train; the training timestamp is
# copied under the forecast key name so the join can match on it
train_for <- train
train_for$forecast_datetime <- train_for$datetime
train_for <- left_join(train_for, fweather_av, by = c("county", "forecast_datetime"))
# Keeps only rows that are complete in every column.
train_for <- na.omit(train_for)

explore

# Even row_id rows are treated as production, odd ones as consumption.
even_row <- train_for$row_id %% 2 == 0
production_for <- train_for[even_row, ]
consumption_for <- train_for[!even_row, ]

# Target vs temperature, coloured green where is_business is 1, else orange.
production_col <- ifelse(production_for$is_business == 1, "green", "orange")
plot(production_for$target ~ production_for$temperature, col = production_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

consumption_col <- ifelse(consumption_for$is_business == 1, "green", "orange")
plot(consumption_for$target ~ consumption_for$temperature, col = consumption_col)
legend(
  "topleft",
  legend = c("Business", "Else"),
  col = c("green", "orange"), pch = 15
)

consumption weather graphs

# Consumption target against each county-averaged forecast weather variable.
plot(consumption_for$target ~ consumption_for$temperature)

plot(consumption_for$target ~ consumption_for$direct_solar_radiation)

plot(consumption_for$target ~ consumption_for$total_precipitation)

plot(consumption_for$target ~ consumption_for$dewpoint)

production weather graphs

# Production target against each county-averaged forecast weather variable.
plot(production_for$target ~ production_for$temperature)

plot(production_for$target ~ production_for$direct_solar_radiation)

plot(production_for$target ~ production_for$total_precipitation)

plot(production_for$target ~ production_for$dewpoint)

forecast weather regression

# Linear model of consumption on the four forecast weather averages
# (additive terms only, no interactions); summary() prints the fit below.
fcon_lm <- lm(target ~ temperature + direct_solar_radiation + total_precipitation + dewpoint, data = consumption_for)
summary(fcon_lm)
## 
## Call:
## lm(formula = target ~ temperature + direct_solar_radiation + 
##     total_precipitation + dewpoint, data = consumption_for)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -693.1  -421.3  -323.9   -85.1 14920.7 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             5.375e+02  1.947e+00 276.099  < 2e-16 ***
## temperature            -1.592e+01  5.354e-01 -29.725  < 2e-16 ***
## direct_solar_radiation -5.982e-02  7.142e-03  -8.376  < 2e-16 ***
## total_precipitation     2.301e+04  5.641e+03   4.079 4.52e-05 ***
## dewpoint                7.449e+00  5.625e-01  13.244  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1205 on 990347 degrees of freedom
## Multiple R-squared:  0.006143,   Adjusted R-squared:  0.006139 
## F-statistic:  1530 on 4 and 990347 DF,  p-value: < 2.2e-16
# Same model specification, fitted to the production rows.
fprod_lm <- lm(target ~ temperature + direct_solar_radiation + total_precipitation + dewpoint, data = production_for)
summary(fprod_lm)
## 
## Call:
## lm(formula = target ~ temperature + direct_solar_radiation + 
##     total_precipitation + dewpoint, data = production_for)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -726.3   -59.8     4.0    28.6 10587.8 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -5.221e+01  5.590e-01  -93.40   <2e-16 ***
## temperature             1.919e+01  1.538e-01  124.83   <2e-16 ***
## direct_solar_radiation  4.838e-01  2.051e-03  235.89   <2e-16 ***
## total_precipitation     4.949e+04  1.620e+03   30.55   <2e-16 ***
## dewpoint               -1.882e+01  1.615e-01 -116.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 345.9 on 990347 degrees of freedom
## Multiple R-squared:  0.1835, Adjusted R-squared:  0.1835 
## F-statistic: 5.563e+04 on 4 and 990347 DF,  p-value: < 2.2e-16